The squared exponential kernel can be implemented in R as follows:
# Squared exponential (RBF) kernel:
#   k(x, y) = signal_sd^2 * exp(-||x - y||^2 / (2 * length_scale^2)).
#
# x, y:         numeric vectors of equal length (two input points).
# signal_sd:    signal standard deviation (output scale).
# length_scale: characteristic length scale of the kernel.
#
# Returns the kernel value as a 1x1 matrix (crossprod() yields a matrix).
kernel_se <- function(
x,
y,
signal_sd = 1,
length_scale = 1
) {
  # Squared Euclidean distance ||x - y||^2:
  squared_distance <- crossprod(x - y)
  signal_sd^2 * exp(-squared_distance / (2 * length_scale^2))
}
We can use this kernel function to sample from a corresponding Gaussian process as follows:
# Draw one sample path from a Gaussian process prior.
#
# X:          input locations (vector or matrix, one row per observation).
# mu:         mean vector (defaults to the zero mean).
# kernel_fun: covariance kernel, applied pairwise via kernel_matrix().
# noise_sd:   observation-noise standard deviation (added on the diagonal).
# ...:        further arguments forwarded to the kernel function.
#
# Returns an n x 1 matrix: mu + L %*% z, where z ~ N(0, I) and L is the
# lower Cholesky factor of K + noise_sd^2 * I.
gaussian_process <- function(
X,
mu = rep(0,nrow(X)),
kernel_fun=kernel_se,
noise_sd=0.1,
...
) {
  # Coerce plain vectors into a one-column matrix:
  if (is.null(dim(X))) {
    X <- matrix(X)
  }
  n <- nrow(X)
  # Gram matrix with noise variance on the diagonal:
  gram <- kernel_matrix(X, kernel_fun = kernel_fun, ...)
  chol_lower <- t(chol(gram + noise_sd^2 * diag(n)))
  # Transform standard-normal draws through the Cholesky factor:
  z <- matrix(rnorm(n), nrow = n)
  mu + chol_lower %*% z
}
The kernel_matrix function corresponds to \(K(\mathbf{X}, \mathbf{X}')\). It simply applies the kernel function \(k(\mathbf{x},\mathbf{x}')\) to all pair-wise combinations of samples in \(\mathbf{X}\) and \(\mathbf{X}'\).
Figure 1.1 shows 5 samples drawn from a Gaussian process with the specified parameters. The samples look rather noisy because the specified noise variance is relatively high. A smaller choice for that parameter yields samples that are much smoother (Figure 1.2).
# Reproducible draws: fix the RNG seed first.
set.seed(rand_seed)
# Sample from the GP with relatively high observation noise (Figure 1.1).
# NOTE(review): gaussian_process_sampler is defined elsewhere — presumably a
# wrapper around gaussian_process() drawing several sample paths; confirm.
gp_samples <- gaussian_process_sampler(noise_sd=0.1,length_scale=1,signal_sd=1)
plot(gp_samples)
Figure 1.1: Samples from a Gaussian process. Shaded area indicates the 95% confidence interval based on the variance prior. High noise variance.
# Same seed as above so only the noise level differs between figures.
set.seed(rand_seed)
# Sample from the GP with (near-zero) observation noise (Figure 1.2).
gp_samples <- gaussian_process_sampler(noise_sd=0.00001,length_scale=1,signal_sd=1)
plot(gp_samples)
Figure 1.2: Samples from a Gaussian process. Shaded area indicates the 95% confidence interval based on the variance prior. Low noise variance.
To implement Gaussian Process Regression (GPReg) we first draw a small set of training points at random as well as a larger set of (equidistant) test points.
# Reproducible draws of the training inputs.
set.seed(rand_seed)
n_train <- 20
# Training inputs sampled uniformly on [-7, 7]:
X <- runif(n_train, min = -7, max = 7)
# Equidistant test inputs on the same interval:
X_test <- seq(-7,7,length.out=100)
GPReg can be implemented in R as follows:
# Gaussian process regression (Rasmussen & Williams, Algorithm 2.1).
#
# X:          training inputs (vector or matrix, one row per observation).
# y:          training targets.
# kernel_fun: covariance kernel, applied pairwise via kernel_matrix().
# noise_sd:   observation-noise standard deviation.
# X_test:     test inputs (vector or matrix).
# inv_tol:    retained for backward compatibility; it was the tolerance of the
#             former dense solve(), which the triangular solves below replace.
# ...:        further arguments forwarded to the kernel function.
#
# Returns an object of class "gp_regression": a list with an (n_test x 2)
# predictions matrix (predictive mean, predictive variance) plus X, y, X_test.
gaussian_process_regression <- function(
X,
y,
kernel_fun=kernel_se,
noise_sd=0.1,
X_test,
inv_tol=1e-30,
...
) {
  # Set up: ----
  # Training data (coerce vectors to one-column matrices):
  if (is.null(dim(X))) {
    X <- matrix(X)
  }
  n <- nrow(X)
  # Test data:
  if (is.null(dim(X_test))) {
    X_test <- matrix(X_test)
  }
  n_test <- nrow(X_test)
  # Derive gram matrix:
  K <- kernel_matrix(X, kernel_fun = kernel_fun, ...) # Training
  L <- t(chol(K + noise_sd^2 * diag(n))) # lower Cholesky factor of K_y
  # Solve K_y %*% alpha = y via two triangular solves (L L^T alpha = y).
  # The previous code re-formed K_y = L %*% t(L) and ran a dense solve() on
  # it, which is slower and less stable and defeats the point of the
  # Cholesky factorization.
  alpha <- as.matrix(backsolve(t(L), forwardsolve(L, y)))
  # Iterate over test cases:
  predictions <- t(
    sapply(
      seq_len(n_test),
      function(i) {
        x_star <- matrix(X_test[i,], nrow = 1)
        # Cross-covariances between training inputs and the test point:
        k_star <- kernel_matrix(X, x_star, kernel_fun = kernel_fun, ...)
        predictive_mean <- crossprod(k_star, alpha)
        # Predictive variance: k(x*, x*) - k*^T K_y^{-1} k*, with v = L^{-1} k*.
        v <- solve(L, k_star)
        predictive_variance <- kernel_matrix(x_star, x_star, kernel_fun = kernel_fun, ...) - crossprod(v)
        return(cbind(mean=predictive_mean, var=predictive_variance))
      }
    )
  )
  # Output:
  gp_regression <- list(
    predictions = predictions,
    X=X,
    y=y,
    X_test=X_test
  )
  class(gp_regression) <- "gp_regression"
  return(gp_regression)
}
# Methods: ----
# Plot: ----
# Plot method for "gp_regression" objects: predictive mean (blue line),
# ~95% confidence band (mean +/- 2 sd, shaded polygon) and training points
# (crosses). Only one-dimensional inputs can be visualized.
# `...` is accepted for compatibility with the plot(x, ...) generic.
# Returns the ggplot object.
plot.gp_regression <- function(gp_regression, ...) {
  # Unpack list elements (predictions, X, y, X_test) into the local scope:
  list2env(gp_regression, envir = environment())
  if (dim(X)[2]>1) {
    stop("Your input is multi-dimensional, which cannot be visualized.")
  }
  # Predictions:
  pred <- data.table(x=X_test[,1], y=predictions[,1])
  # Confidence bands (predictions[,2] is the predictive variance):
  ci <- data.table(
    x=X_test[,1],
    lb=predictions[,1]-2*sqrt(predictions[,2]),
    ub=predictions[,1]+2*sqrt(predictions[,2])
  )
  ci <- melt(ci, id.vars = "x", value.name = "y")
  ci[,id:=1]
  # Reverse the lower bound so the band traces a closed polygon:
  ci[variable=="lb",x:=rev(x)]
  ci[variable=="lb",y:=rev(y)]
  # Training points:
  train <- data.table(x=c(X), y=c(y))
  p <- ggplot() +
    geom_polygon(data=ci, aes(x=x, y=y, group=id), fill="blue", alpha=0.1) +
    geom_line(data=pred, aes(x=x, y=y), colour="blue") +
    geom_point(data=train, aes(x=x, y=y), shape=3, colour="blue") +
    labs(
      x="input, x",
      y="output, f(x)"
    ) +
    scale_x_continuous(expand=c(0,0))
  # (The former bare `p` before return() was a no-op inside a function body.)
  return(p)
}
# UCB: ----
# Upper Confidence Bound (UCB) acquisition: selects the test point maximizing
# predictive mean + exploring_rate * predictive uncertainty.
# NOTE(review): predictions[,2] holds the predictive VARIANCE; textbook UCB
# weights the standard deviation (sqrt) instead — confirm this is intended.
# Returns the selected test input as a 1-row matrix.
acqui_ucb.gp_regression <- function(
gp_regression, exploring_rate=0.5, fn_scale=1, verbose=1
) {
# Unpack predictions, X, y, X_test into the local scope:
list2env(gp_regression, envir = environment())
# Apply UCB
value_estimate <- predictions[,1] + exploring_rate * predictions[,2]
idx_max <- which.max(value_estimate)
# At the highest verbosity, overlay the estimated values on the current plot
# (assumes a base-graphics plot of the true function is already open):
if (verbose==3) {
points(fn_scale * predictions[,1], col=alpha("blue",0.5), cex=0.8, t="l")
abline(v=idx_max, lty=2)
legend(
"topright",
legend=c("True value", "Estimated value"),
lty=c(1,1), col=c("black", "blue")
)
}
# Proposed next sampling location (1 x d matrix):
X_t <- matrix(X_test[idx_max,], ncol = ncol(gp_regression$X))
# Return:
return(X_t)
}
# Generic for the UCB acquisition function; dispatches on the class of
# gp_regression (see acqui_ucb.gp_regression).
acqui_ucb <- function(
gp_regression, exploring_rate=0.5, fn_scale=1, verbose=1
) {
UseMethod("acqui_ucb", gp_regression)
}
# PI: ----
# Probability of Improvement (PI) acquisition: selects the test point with the
# highest probability of improving on the incumbent max(y) by at least
# exploring_rate, under the Gaussian posterior.
# Returns the selected test input as a 1-row matrix.
acqui_pi.gp_regression <- function(
gp_regression, exploring_rate=0.5, fn_scale=1, verbose=1
) {
# Unpack predictions, X, y, X_test into the local scope:
list2env(gp_regression, envir = environment())
# Apply PI:
# Incumbent best observed target value:
mu_star <- max(y)
# Standardized improvement (predictions[,2] is the predictive variance):
z_score <- (predictions[,1] - mu_star - exploring_rate)/sqrt(predictions[,2])
value_estimate <- pnorm(z_score)
idx_max <- which.max(value_estimate)
# At the highest verbosity, overlay the estimated values on the current plot:
if (verbose==3) {
points(fn_scale * predictions[,1], col=alpha("blue",0.5), cex=0.8, t="l")
abline(v=idx_max, lty=2)
legend(
"topright",
legend=c("True value", "Estimated value"),
lty=c(1,1), col=c("black", "blue")
)
}
# Proposed next sampling location (1 x d matrix):
X_t <- matrix(X_test[idx_max,], ncol = ncol(gp_regression$X))
# Return:
return(X_t)
}
# Generic for the PI acquisition function; dispatches on the class of
# gp_regression (see acqui_pi.gp_regression).
acqui_pi <- function(
gp_regression, exploring_rate=0.5, fn_scale=1, verbose=1
) {
UseMethod("acqui_pi", gp_regression)
}
# EI: ----
# Expected Improvement (EI) acquisition: selects the test point maximizing the
# expected improvement over the incumbent max(y), offset by exploring_rate.
# Returns the selected test input as a 1-row matrix.
acqui_ei.gp_regression <- function(
gp_regression, exploring_rate=0.5, fn_scale=1, verbose=1
) {
# Unpack predictions, X, y, X_test into the local scope:
list2env(gp_regression, envir = environment())
# Apply EI:
# Incumbent best observed target value:
mu_star <- max(y)
# Standardized improvement (predictions[,2] is the predictive variance):
z_score <- (predictions[,1] - mu_star - exploring_rate)/sqrt(predictions[,2])
# Closed-form EI: improvement * Phi(z) + sd * phi(z).
value_estimate <- (predictions[,1] - mu_star - exploring_rate) * pnorm(z_score) +
sqrt(predictions[,2]) * dnorm(z_score)
idx_max <- which.max(value_estimate)
# At the highest verbosity, overlay the estimated values on the current plot:
if (verbose==3) {
points(fn_scale * predictions[,1], col=alpha("blue",0.5), cex=0.8, t="l")
abline(v=idx_max, lty=2)
# points(x=idx_max, y=fn_scale * predictions[idx_max,1], col="blue", cex=1.5, pch=16)
legend(
"topright",
legend=c("True value", "Estimated value"),
lty=c(1,1), col=c("black", "blue")
)
}
# Proposed next sampling location (1 x d matrix):
X_t <- matrix(X_test[idx_max,], ncol = ncol(gp_regression$X))
# Return:
return(X_t)
}
# Generic for the EI acquisition function; dispatches on the class of
# gp_regression (see acqui_ei.gp_regression).
acqui_ei <- function(
gp_regression, exploring_rate=0.5, fn_scale=1, verbose=1
) {
UseMethod("acqui_ei", gp_regression)
}
Applying this function to our train and test data using the default parameters as before yields the following picture:
# Example: sample targets from the GP prior, then fit GP regression with the
# default hyperparameters and save the resulting figure.
set.seed(rand_seed)
y <- gaussian_process(X=X)
gp_reg <- gaussian_process_regression(X=X, y=y, X_test = X_test)
plot(gp_reg)
ggsave("www/gp_reg.png", width = 7, height = 5)
Choosing instead a very low value for the length scale yields the following picture:
# Example with a very short length scale (wiggly functions) and near-zero
# observation noise; the same hyperparameters are used for sampling and
# for the regression fit.
set.seed(rand_seed)
length_scale <- 0.3
signal_sd <- 1.08
noise_sd <- 0.00005
y <- gaussian_process(
X=X,
signal_sd=signal_sd,
length_scale=length_scale,
noise_sd = noise_sd
)
gp_reg <- gaussian_process_regression(
X=X, y=y, X_test = X_test,
signal_sd=signal_sd,
length_scale=length_scale,
noise_sd = noise_sd
)
plot(gp_reg)
Finally, choosing a relatively large value for the length scale, we get the following picture:
# Example with a long length scale (smooth functions) and high observation
# noise; again the same hyperparameters are used for sampling and fitting.
set.seed(rand_seed)
length_scale <- 3
signal_sd <- 1.16
noise_sd <- 0.89
y <- gaussian_process(
X=X,
signal_sd=signal_sd,
length_scale=length_scale,
noise_sd = noise_sd
)
gp_reg <- gaussian_process_regression(
X=X, y=y, X_test = X_test,
signal_sd=signal_sd,
length_scale=length_scale,
noise_sd = noise_sd
)
plot(gp_reg)
Here we will first briefly look at the implementation of Bayesian Optimization in R with UCB as the acquisition function. We shall first generate a small set of training points as well as a grid of pair-wise test point combinations.
Note: Because I have implemented everything from scratch and the hyperparameter optimization routine is computationally very expensive, I choose a testing grid that is not very granular. You can increase the `n_test` parameter to use a finer grid.
# Setup for 2-D Bayesian optimization: domain limits, initial GP
# hyperparameters, a handful of random training points and a coarse
# n_test x n_test candidate grid.
set.seed(rand_seed)
x_lim <- c(-5,10)
y_lim <- c(0,15)
n_test <- 10
length_scale <- 1
signal_sd <- 1
noise_sd <- 0.1
hyper_params <- list(
length_scale=length_scale,
signal_sd=signal_sd,
noise_sd=noise_sd
)
n <- 5
x_1 <- runif(n, min = -5, max = 10)
x_2 <- runif(n, min = 0, max = 15)
X <- cbind(x_1,x_2) # training points
# Test points:
X_test <- as.matrix(
expand.grid(
x_1 = seq(min(x_lim),max(x_lim),length.out=n_test),
x_2 = seq(min(y_lim),max(y_lim),length.out=n_test)
)
)
With these inputs at hand, Bayesian Optimization can be implemented as follows:
library(R.utils)
# Bayesian optimization of objective_fun over the candidate grid X_test using
# GP regression as the surrogate model.
#
# objective_fun:      function to optimize (evaluated on input rows).
# X:                  initial training inputs.
# X_test:             candidate inputs scored by the acquisition function.
# n_iter:             number of optimization iterations.
# hyper_params:       initial GP hyperparameters.
# kernel_fun:         covariance kernel for the surrogate.
# acquisition_fun:    acquisition generic (acqui_ucb, acqui_pi or acqui_ei).
# exploring_rate:     exploration parameter for the acquisition function.
# verbose:            1/2/3 for increasing levels of messages and plots.
# fn_scale:           -1 turns minimization of objective_fun into maximization.
# store_path:         if TRUE, record the posterior and proposed optima per step.
# max_time_hyper:     time budget (seconds) per hyperparameter update.
# update_hyper_every: run hyperparameter optimization every k-th iteration.
#
# Returns an object of class "bayes_optimum".
bayesian_optimization <- function(
objective_fun=branin,
X,
X_test,
n_iter = 100,
hyper_params = list(
length_scale = 1,
signal_sd = 1,
noise_sd = 0.1
),
kernel_fun = kernel_se,
acquisition_fun = acqui_ucb,
exploring_rate = 0.5,
verbose = 1,
fn_scale = -1,
store_path = FALSE,
max_time_hyper = 100,
update_hyper_every = 1
) {
# Setup: ----
# Initial hyper parameters:
list2env(hyper_params, envir = environment())
y <- fn_scale * objective_fun(X) # initialize targets
counter <- 1
posterior <- NULL # to store posterior values
optima <- NULL # to store proposed optima
# Best achievable objective value on the test grid; the two fn_scale factors
# cancel in sign, so for fn_scale = -1 this is the grid minimum:
true_optimum <- fn_scale * max(fn_scale * objective_fun(X_test))
true_fun <- objective_fun(X_test) # true function values
loss <- as.matrix(rep(0, n_iter))
# Bayesian optimization: ----
while (counter <= n_iter) {
if(verbose %in% c(1,2,3)) {
message(sprintf("Iteration %i:", counter))
}
# Update hyperparameters:
update_this_round <- counter %% update_hyper_every == 0 # update hyper?
if (update_this_round) {
# Keep the previous hyperparameters if the optimization errors or exceeds
# the time budget:
hyper_params <- tryCatch(
withTimeout(
optim_hyper(hyper_params, X=X, y=y, kernel_fun = kernel_fun),
timeout = max_time_hyper,
onTimeout = "error"
),
error = function(e) {
return(hyper_params)
}
)
}
if (verbose==3) {
# Plot true function values for X_test:
plot(
objective_fun(X_test),
cex=0.8, pch=18, t="l",
xlab="Point", ylab="output, f(x)"
)
points(
x=which.min(objective_fun(X_test)), y=min(objective_fun(X_test)),
pch=18,
cex=2
)
}
# Run GP regression:
args <- c(
list(X=X, y=y, X_test = X_test, kernel_fun = kernel_fun),
hyper_params
)
# NOTE(review): the handler reuses the previous iteration's gp_reg; if the
# very FIRST iteration errors here, gp_reg does not exist yet and the
# handler itself fails — consider initializing gp_reg <- NULL before the loop.
gp_reg <- tryCatch(
do.call(gaussian_process_regression, args = args),
error = function(e) {
return(gp_reg)
}
)
# Run acquisition function:
X_star <- acquisition_fun(
gp_reg, exploring_rate=exploring_rate, fn_scale=fn_scale,
verbose=verbose
)
X <- rbind(X, X_star) # store new data
y_t <- fn_scale * objective_fun(X_star)
y <- as.matrix(c(y,y_t)) # update target vector
# Squared distance between the best achievable value and the new sample:
loss_t <- (true_optimum - y_t)^2
loss[counter] <- loss_t # update loss
# Store posterior:
if (store_path) {
# Posterior
posterior_t <- data.table(gp_reg$predictions)
setnames(posterior_t, names(posterior_t), c("value", "uncertainty"))
posterior_t[,t:=counter]
posterior_t[,point:=1:.N]
posterior <- rbind(
posterior,
posterior_t
)
# Minimum
# NOTE(review): exact floating-point equality; returns integer(0) when the
# sampled value does not coincide with a grid value — confirm intended.
point <- which(true_fun==fn_scale * y_t)
optima_t <- data.table(point=point, value=fn_scale * y_t)
optima_t[,t:=counter]
optima <- rbind(optima, optima_t)
}
# Print results:
if(verbose==2) {
print("New point:")
print(X_star)
print("Function value:")
print(fn_scale * y_t)
}
counter <- counter + 1 # update counter
}
# Undo the internal sign flip so y is on the objective's original scale:
y <- fn_scale * y
path <- list(
posterior = posterior,
optima = optima
)
bayes_optimum <- list(
X_star = X_star,
X = X,
y = y,
X_test = X_test,
true_optimum = true_optimum,
loss = loss,
objective_fun = objective_fun,
path = path,
fn_scale = fn_scale
)
class(bayes_optimum) <- "bayes_optimum"
return(bayes_optimum)
}
library(ggplot2)
# Plot method for "bayes_optimum" objects: loss (squared distance to the true
# optimum) as a function of the iteration counter.
# `...` is accepted for compatibility with the plot(x, ...) generic.
# Returns the ggplot object.
plot.bayes_optimum <- function(bayes_optimum, ...) {
  loss <- data.table(loss=c(bayes_optimum$loss))
  loss[,t:=.I] # iteration index
  p <- ggplot(data=loss, aes(x=t, y=loss)) +
    geom_line() +
    labs(
      x="T",
      y="Loss"
    )
  # (The former bare `p` before return() was a no-op inside a function body.)
  return(p)
}
library(gganimate)
# Animate the learning path of the Bayesian optimizer: at each iteration, the
# estimated function over the test grid, its confidence band, and the true and
# estimated optima. Requires store_path = TRUE in bayesian_optimization().
# Returns a gganimate animation.
plot_path.bayes_optimum <- function(bayes_optimum) {
if (is.null(bayes_optimum$path$posterior)) {
stop("Path of posterior was not stored.")
}
# Bring `posterior` and `optima` into the local scope:
list2env(bayes_optimum$path, envir = environment())
# Function path:
path <- copy(posterior)
# Map the surrogate's internal (sign-flipped) values back to objective scale:
path[,value:=value * bayes_optimum$fn_scale]
setnames(path, "value", "estimate")
true_y <- bayes_optimum$objective_fun(bayes_optimum$X_test)
ylim <- range(true_y)
ylim[1] <- ylim[1] * 0.9
ylim[2] <- ylim[2] * 1.1
# Repeat the true values once per iteration (assumes one posterior row per
# grid point per iteration):
path[,true:=rep.int(true_y,max(t))]
path[,point:=1:.N,by=.(t)]
# NOTE(review): the band is estimate +/- 2*uncertainty where `uncertainty`
# comes from predictions[,2], i.e. the predictive VARIANCE — confirm whether
# +/- 2*sqrt(uncertainty) was intended (as in plot.gp_regression).
path[,lb:=estimate-2*uncertainty]
path[,ub:=estimate+2*uncertainty]
path <- melt(path, id.vars = c("t", "point", "uncertainty"))
path[,uncertainty:=NULL]
point_estimate <- copy(path[variable %in% c("estimate", "true")])
# Confidence bands:
ci <- copy(path[variable %in% c("lb", "ub")])
ci[,id:=1]
# Reverse the lower bound within each frame so the band forms a closed polygon:
ci[variable=="lb",point:=rev(point),by=.(t)]
ci[variable=="lb",value:=rev(value),by=.(t)]
# Minimum path:
y_star <- bayes_optimum$true_optimum
X_star <- which(bayes_optimum$objective_fun(bayes_optimum$X_test)==y_star)
# One "true optimum" row per animation frame (assumes optima has one row per
# iteration):
true_optima <- data.table(
point=X_star,
value=y_star,
t=1:optima[,.N],
variable="true"
)
optima[,variable:="estimate"]
optima_path <- rbind(
true_optima,
optima
)
p <- ggplot() +
geom_polygon(
data=ci,
aes(x=point, y=value, group=id),
fill="blue",
alpha=0.1
) +
geom_line(
data=point_estimate,
aes(x=point, y=value, colour=variable)
) +
geom_point(
data=optima_path,
aes(x=point, y=value, colour=variable),
size=5,
pch=18
) +
scale_colour_manual(
name="Value:",
values = c("blue", "black")
) +
# Animate over iterations t:
transition_states(
t,
transition_length = 3,
state_length = 1
) +
enter_fade() +
exit_fade() +
ease_aes('linear') +
labs(
title="Iteration: {closest_state}",
subtitle = "Evolution of learned function. Diamonds are true and estimated optima."
) +
coord_cartesian(ylim=ylim)
animate(p, width = 500, height = 300, fps = 3)
}
# Generic for plotting the optimization path; dispatches on the class of
# bayes_optimum (see plot_path.bayes_optimum).
plot_path <- function(bayes_optimum) {
UseMethod("plot_path", bayes_optimum)
}
Hyperparameters for the GP are tuned through the following multi-started optimization routine:
# Hyper parameters: ----
# Multi-start maximum-likelihood optimization of GP hyperparameters.
#
# theta:      named list of starting values (length_scale, signal_sd, noise_sd).
# X, y:       training inputs and targets.
# kernel_fun: covariance kernel, applied pairwise via kernel_matrix().
# control:    control list for optim(); fnscale = -1 makes optim maximize.
# tol:        retained for backward compatibility; it was the tolerance of the
#             former dense solve(), which the triangular solves below replace.
# min_par:    lower end of the uniform ranges for the random restarts.
# eps:        jitter added to the diagonal before Cholesky factorization.
#
# Returns the parameter set achieving the highest log marginal likelihood
# across all restarts.
optim_hyper <- function(
theta,
X,
y,
kernel_fun,
control = list(
maxit=50,
fnscale=-1
),
tol=1e-30,
min_par=1e-5,
eps=1e-30
) {
  # Log marginal likelihood (Rasmussen & Williams, Eq. 2.30):
  lml <- function(theta, X, y, kernel_fun) {
    # Unpack parameters (signal_sd, length_scale, noise_sd) into local scope:
    theta <- as.list(theta)
    list2env(theta, envir = environment())
    n <- nrow(as.matrix(X))
    # Covariance matrix:
    K <- kernel_matrix(X, kernel_fun = kernel_fun, signal_sd=signal_sd, length_scale=length_scale)
    K_y <- K + noise_sd^2 * diag(n)
    # BUG FIX: `K_y + eps` added the jitter to EVERY entry of the matrix;
    # the stabilizing jitter belongs on the diagonal only.
    L <- t(chol(K_y + eps * diag(n))) # Cholesky decompose
    # Solve K_y alpha = y via two triangular solves (L L^T alpha = y) instead
    # of re-forming K_y and calling a dense solve() on it.
    alpha <- as.matrix(backsolve(t(L), forwardsolve(L, y)))
    # Compute log marginal likelihood:
    lml <- (- (crossprod(y,alpha))/2 - sum(log(diag(L))) - (n/2) * log(2*pi))
    return(lml)
  }
  # Multi-started optimization: ----
  multi_starts <- list(
    theta,
    lapply(theta, function(i) 1), # try all ones
    lapply(theta, function(i) 10), # try all 10s
    lapply(theta, function(i) runif(1, min = min_par, max=1)), # try all uniform [min,1]
    lapply(theta, function(i) runif(1, min = min_par, max=10)), # try all uniform [min,10]
    lapply(theta, function(i) runif(1, min = min_par, max=100)) # try all uniform [min,100]
  )
  # Run:
  multi_optim <- lapply(
    multi_starts,
    function(theta) {
      # Optimal values (bounded L-BFGS-B keeps all parameters positive):
      optim_output <- optim(
        par=theta,
        X=X, y=y, kernel_fun=kernel_fun,
        fn=lml,
        control=control,
        method="L-BFGS-B",
        lower=rep(1e-5,length(theta)),
        upper=rep(Inf,length(theta))
      )
      optim_value <- optim_output$value
      optim_theta <- optim_output$par
      return(
        list(
          value = optim_value,
          theta = optim_theta
        )
      )
    }
  )
  # Output: keep the restart with the highest log marginal likelihood.
  idx_max <- which.max(sapply(multi_optim, function(i) i$value))
  theta <- multi_optim[[idx_max]]$theta
  return(theta)
}
The code below runs Bayesian Optimization for a number of iterations:
# Run Bayesian optimization with UCB (the default acquisition function) and
# full per-iteration plotting; cache the result to disk so the document can be
# re-knit without re-running the optimization.
n_iter <- 50
bayes_optimum <- bayesian_optimization(
objective_fun=branin,
X=X,
X_test = X_test,
n_iter=n_iter,
hyper_params = hyper_params,
verbose = 3,
exploring_rate=0.5,
fn_scale = -1,
store_path = TRUE
)
saveRDS(bayes_optimum, file="results/bayes_optimum.rds")
bayes_optimum <- readRDS("results/bayes_optimum.rds")
Figure 2.1 shows the loss as a function of the number of iterations. Evidently, the Bayesian optimizer gradually approaches the true optimum of the function as it learns from the observed data.
# Loss over iterations (Figure 2.1); dispatches to plot.bayes_optimum.
plot(bayes_optimum)
Figure 2.1: Loss over time.
# Render and save the learning-path animation (Figure 2.2).
plot_path(bayes_optimum)
anim_save("www/bayes_opt.gif")
Let us have a quick look at what is actually going on under the hood. Figure 2.2 shows in black the true function values evaluated at the pair-wise test point combinations. The blue line reflects the corresponding posterior means that the Bayes Optimizer learns through time. The blue shaded area indicates the 95% confidence interval based on the learned posterior variances.
The Bayes Optimizer gradually gets better at estimating the true function values. As it explores different points on the test grid, the uncertainty around these points shrinks. Sometimes the overall magnitude of the confidence interval suddenly appears to change, which corresponds to occasions when the estimates of the optimal hyperparameters change significantly. Eventually the learned function values are very close to the true function values and the proposed optimum corresponds to the true optimum (among the test points).
# Embed the saved animation in the document (Figure 2.2):
knitr::include_graphics("www/bayes_opt.gif")
Figure 2.2: The learning path of the algorithm.
# Simulation grid: every combination of exploration rate and acquisition
# function; hyperparameters are re-optimized only every 5 iterations to save
# computation.
n_iter <- 100
update_hyper_every <- 5
exploring_rates <- c(0.001,0.5,100)
acqui_funs <- list(
ucb = acqui_ucb,
pi = acqui_pi,
ei = acqui_ei
)
# Cross-join (data.table::CJ) of all exploration rates and acquisition names:
grid <- CJ(explore=exploring_rates, acqui=names(acqui_funs))
# Run Bayesian optimization once per grid row and collect the results;
# cache to disk afterwards.
sim_output <- lapply(
1:nrow(grid),
function(i) {
time_start <- Sys.time()
# Bring `explore` and `acqui` from the current grid row into scope:
list2env(c(grid[i,]), envir = environment()) # grid values
print(sprintf(
"Running for: exploring rate %0.3f, acquisition %s",
explore,
acqui
))
bayes_optimum <- bayesian_optimization(
objective_fun=branin,
X=X,
X_test = X_test,
n_iter=n_iter,
hyper_params = hyper_params,
verbose = 1,
exploring_rate=explore,
fn_scale = -1,
store_path = TRUE,
acquisition_fun = acqui_funs[[acqui]],
update_hyper_every = update_hyper_every,
max_time_hyper = 120
)
print(sprintf("Time spent on first sim:"))
print(Sys.time()-time_start)
# Return the result together with its grid coordinates:
return(
list(
output = bayes_optimum,
exploring_rate = explore,
acqui_fun = acqui
)
)
}
)
saveRDS(sim_output, file="results/bayes_opt_sim.rds")
Finally, Figure 2.3 shows the loss evolution with different acquisition functions for varying exploration parameters (by row). For UCB and EI the results are intuitive with the former performing better overall: as the exploration rate increases they are more likely to explore and hence at times commit errors. For PI the results are poor across the board. The choices of the exploration rate may be too high for PI. Another explanation may be that, due to computational constraints, I have run the multi-started hyperparameter optimization only every five iterations.
# Reload the cached simulations and stack the per-run loss series into one
# long data.table keyed by iteration, exploration rate and acquisition.
sim_output <- readRDS("results/bayes_opt_sim.rds")
sim_output <- rbindlist(
lapply(
sim_output,
function(i) {
loss <- data.table(
iter = 1:length(c(i$output$loss)),
loss = c(i$output$loss),
explore = i$exploring_rate,
acqui = i$acqui_fun
)
return(loss)
}
)
)
# Loss curves coloured by acquisition function, one facet row per
# exploration rate (Figure 2.3).
p <- ggplot(data=sim_output, aes(x=iter, y=loss, colour=acqui)) +
geom_line() +
facet_grid(rows = vars(explore)) +
scale_color_discrete(
name="Acquisition function:"
) +
labs(
x="Iteration",
y="Loss"
)
p
Figure 2.3: Comparison of loss evolution with different acquisition functions for varying exploration parameters (by row).